# encoding: utf-8

"""
Seb Harrevelt 2017.

Analyzing the bio-text from the DIAG website.

This script uses various methods to gather information from a website and then displays this information.
The information is retrieved with a simple requests.get call; after parsing it with BeautifulSoup, the text
is analyzed by the coreNLP module provided by Stanford University.
After some preprocessing, we eventually obtain a list of edges plus some properties that belong to each edge.

The set of graphs that can be created show the relation between people from DIAG and
	- Universities;
	- Other people;
	- Other people from DIAG;
	- Organizations;
	- Dates;
	
These labels are the result of the NER-tagging in the coreNLP process. 

Finally the information is displayed with HTML + JavaScript code. In order for this to be an easily portable product, the 
data that is generated by this code is pasted into the .html script.

"""

from bs4 import BeautifulSoup as bS
import collections
import networkx as nx
import os
from pycorenlp import StanfordCoreNLP
import re
import requests
import unicodedata

# Client for a coreNLP server that is expected to listen on localhost:9000.
# The server can be started via the following command:
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
# For the latest version, see: https://stanfordnlp.github.io/CoreNLP/
nlp = StanfordCoreNLP('http://localhost:9000')

# <editor-fold desc='Define the functions that we will need for this script..'>


def edge2graph(input_label, input_edges, input_diag_name, input_bio_dict):
    """
    Combines all the info of one category in one graph object. This is a preprocessing step to store the
    info as a json.

    Every edge gets a 'text' attribute holding the relation found for it (empty string when there is none).
    Every node gets a 'group' and 'fill' attribute, plus a 'value' attribute holding its degree.

    :param input_label: The category we are going to preprocess; only edges whose last item equals it are used
    :param input_edges: List of all edges, as (name, entity, label) tuples, found while analyzing the text
    :param input_diag_name: List of all diag people names
    :param input_bio_dict: Dict of the bio info of all the diag people
    :return: an undirected networkx Graph containing the edges of the requested category
    """
    # Compare against the label slot only: a membership test on the whole tuple would also select edges
    # whose name or entity text happens to be equal to the label.
    ppl_label_edge_list = [x[:-1] for x in input_edges if x[-1] == input_label]
    edge_label_dict = get_edge_label(ppl_label_edge_list, input_diag_name, input_bio_dict)

    graph_obj = nx.Graph()
    for x in ppl_label_edge_list:
        # Keyword attributes and add_node() are used (instead of attr_dict= and graph.node[...]) because
        # they give the same result on networkx 1.x and keep working on networkx 2.x and later.
        graph_obj.add_edge(x[0], x[1], text=edge_label_dict.get(x, ''))

        # The order matters when a node is both a source and an entity: the last write wins.
        graph_obj.add_node(x[0], group='DIAG_PEOPLE', fill='black')
        graph_obj.add_node(x[1], group=input_label, fill='blue')

    # dict() accepts both the plain dict of networkx 1.x and the degree view of later versions.
    for i_node, i_weight in dict(nx.degree(graph_obj)).items():
        graph_obj.add_node(i_node, value=i_weight)

    return graph_obj


def graph2json(graph_obj, fname):
    """
    Writes a graph to a json file in node-link format.
    This will allow us to use the json in combination with d3.js to create some nice graphs.

    :param graph_obj: a graph object of networkx
    :param fname: the filename of the json
    :return: None, the result is written to fname
    """

    import json
    from networkx.readwrite import json_graph
    temp_dict = json_graph.node_link_data(graph_obj)
    temp_nodes = temp_dict['nodes']
    # Replace the 'source' / 'target' of every link by the id of the node found at that position.
    # NOTE(review): this lookup treats 'source' / 'target' as positions in the 'nodes' list -- confirm that
    # the installed networkx version emits them that way.
    for i_dict in temp_dict['links']:
        i_dict.update(source=temp_nodes[i_dict['source']]['id'],
                      target=temp_nodes[i_dict['target']]['id'])
    # The context manager makes sure the file is flushed and closed, also when json.dump fails.
    with open(fname, 'w') as json_file:
        json.dump(temp_dict, json_file, indent=1)


def concat_function(input_list):
    """
    Function to concat the text elements of a list of tuples.

    Consecutive items that carry the same entity are merged into one item, whose text is the
    space-separated concatenation of the individual texts. An empty input yields nothing.

    :param input_list: list of tuples, first index is a text, second index is an entity
    :return: generator of (concatenated text, entity) tuples, in the order of the input
    """
    group_texts = []
    group_ent = None
    for x in input_list:
        # An explicit "is the group empty" test is used instead of the truthiness of the previous entity,
        # so that a falsy entity (None or '') still closes its own group.
        if group_texts and x[1] != group_ent:
            yield ' '.join(group_texts).strip(), group_ent
            group_texts = []
        group_texts.append(x[0])
        group_ent = x[1]

    # Only emit the last group when there is one; otherwise an empty input would produce ('', None).
    if group_texts:
        yield ' '.join(group_texts).strip(), group_ent


def extract_ner_openie(input_res):
    """
    Pull the NER and openIE information out of a coreNLP result, sentence by sentence.

    :param input_res: a resulting object from the API call; its 'sentences' entry is iterated and each
        sentence is read through its 'openie' and 'tokens' entries
    :return: two lists: the (text, entity) tuples of all tokens not tagged 'O', and the
        (object, subject, relation) tuples of openIE, resp.
    """
    collected_ner = []
    collected_openie = []
    for sentence in input_res['sentences']:
        triples = [(rel['object'], rel['subject'], rel['relation']) for rel in sentence['openie']]
        tagged_tokens = [(token['originalText'], token['ner']) for token in sentence['tokens']]
        # Merge neighbouring tokens with the same tag, then drop everything tagged 'O'.
        entities = [chunk for chunk in concat_function(tagged_tokens) if chunk[1] != 'O']

        collected_openie += triples
        collected_ner += entities

    return collected_ner, collected_openie


def get_edge_relation(name, rel_list, bio_dict):
    """
    Collect the OpenIE results that mention the entities connected to a given name.

    :param name: name that is present in bio_dict (diag people)
    :param rel_list: list of relations (tuples); only those containing name are used, and their second
        item is taken as the entity
    :param bio_dict: dict with the bio info of the diag people; bio_dict[name]['openie'] is searched
    :return: list of the OpenIE tuples of name that contain one of these entities
    """
    own_edges = [edge for edge in rel_list if name in edge]
    openie_tuples = bio_dict[name]['openie']

    found = []
    for edge in own_edges:
        # Spot how the entity of this edge shows up in the OpenIE tuples of the bio
        entity = edge[1]
        found += [triple for triple in openie_tuples if entity in triple]
    return found


def get_edge_label(sub_graph, source_list, input_bio_dict):
    """
    Here we get a dictionary with all the relations belonging to the sub_graph.
    Only relations whose concatenated text contains 'he', 'she' or one of the parts of the name are kept,
    and they are stored under the full name.

    :param sub_graph: A list of edges (that can be seen as a subgraph of the full graph)
    :param source_list: A list of names
    :param input_bio_dict: A dict with the bio info of all the names
    :return: a dictionary that maps (name, object) to the relation; a later relation for the same key
        replaces an earlier one
    """
    temp_dict = {}
    for i_name in source_list:
        # Escape the name parts so that characters such as '.', '(' or '[' are matched literally instead of
        # being read as regex syntax. Building the alternatives as a list also avoids a trailing empty
        # alternative (which matches everything) when the name has no parts.
        # NOTE: 'he' has no word boundary, so it also matches inside longer words such as 'the'.
        i_alternatives = ['he', 'she'] + [re.escape(i_part) for i_part in i_name.split()]
        i_grep_name = re.compile('|'.join(i_alternatives))
        res_rel = get_edge_relation(i_name, sub_graph, input_bio_dict)
        for x in res_rel:
            if i_grep_name.search(''.join(x)):
                temp_dict[(i_name, x[0])] = x[2]
    return temp_dict
# </editor-fold>

# <editor-fold desc='Requesting the data from the diag website'>
# Here we retrieve all the data and store it in a dict for later use.
# First we get all the names that are present in DIAG, then we create their unique url from which we gather the text.

base_url = 'http://www.diagnijmegen.nl/index.php/'
people_url = base_url + 'People'

# Without a timeout a request waits indefinitely when the server does not answer.
request_timeout = 30  # seconds

people_url_resp = requests.get(people_url, timeout=request_timeout)
people_url_text = bS(people_url_resp.text, 'lxml')

# All tags that have an href attribute and the class 'personLink'
people_tag_link = people_url_text.find_all(attrs={"href": re.compile(r".*"), 'class': 'personLink'})
# The first two characters of every href are dropped before it is appended to the base url
# (presumably a leading './' -- TODO confirm against the page source).
people_url_link = [os.path.join(base_url, x['href'][2:]) for x in people_tag_link]
# The name is whatever follows the last '=' in the final part of the url
people_url_link_name = [(x, re.sub('.*=(.*)', '\\1', os.path.basename(x))) for x in people_url_link]

# Here we extract the bio-text that we have found online
people_bio_dict = {}
diag_name_list = []
for i_person_url, i_name in people_url_link_name:
    clean_name = i_name.replace('_', ' ')
    res = requests.get(i_person_url, timeout=request_timeout)
    temp = bS(res.text, 'lxml')
    bio_tag_text = temp.find_all(attrs={"class": 'personBio'})
    # Join the text of all 'personBio' elements and normalize it (NFKD)
    temp_text = unicodedata.normalize('NFKD', ' '.join(x.text for x in bio_tag_text))
    diag_name_list.append(clean_name)
    people_bio_dict[clean_name] = {'text': temp_text}
# </editor-fold>

# <editor-fold desc='Obtaining more info from the bio text via coreNLP and update the bio dict'>
# By using coreNLP we can gather relational data from the bio-text. This includes NER and openIE (open Information
# Extraction).
# With the result of the NER we can see which institutes, organizations, people or dates are present in the bio text...
# and with openIE's result we can extract what the relation is between the diag person and the found entities.

for i_name, i_bio_dict in people_bio_dict.items():
    res = ''
    i = 0
    # Sometimes we need to recall the text and send it to the server to get a proper result
    # However we limit it to max 10 recalls for safety.
    # NOTE(review): pycorenlp appears to hand back the raw response text (a str) when the reply cannot be
    # decoded as json -- confirm. Such a reply is treated as a failed attempt here, because only a dict can
    # be processed by extract_ner_openie.
    while not (res and isinstance(res, dict)) and i < 10:
        res = nlp.annotate(i_bio_dict['text'], properties={
            'annotators': 'openie, ner',
            'outputFormat': 'json'})
        i += 1
    if res and isinstance(res, dict):
        output_ner, output_openie = extract_ner_openie(res)
        temp_dict = {'ner': output_ner, 'openie': output_openie}
    else:
        print('We could not find results for {0}'.format(i_name))
        # Store empty results, so that code reading the 'ner' / 'openie' entries of this person
        # does not fail with a KeyError.
        temp_dict = {'ner': [], 'openie': []}
    people_bio_dict[i_name].update(temp_dict)
# </editor-fold>

# <editor-fold desc='Parse all bio info text'>
# Here we parse all the information in a format that is suitable to create graphs from it

edge_list = []
ner_dict = {}
for i_name, i_bio_dict in people_bio_dict.items():
    # One edge from the person to every entity found in the bio
    edge_list += [(i_name, entity) for entity, _ in i_bio_dict['ner']]
    # Merge the entity -> category pairs of this person into the overall mapping
    ner_dict.update(i_bio_dict['ner'])

# Assuming that no entity has more than one ner-category...
# Every edge gets the category that the final ner_dict mapping holds for its entity.
# This makes it easier to filter certain edges later on...
edge_list = [(source, entity, ner_dict[entity]) for source, entity in edge_list]

# DQ check if we have all the names labeled correctly...
A = set(diag_name_list)
B = set(ner_dict)
missing_entities = A - B  # Here we see that the ner_dict misses some names!
missing_ent_dict = dict.fromkeys(missing_entities, 'PERSON')
ner_dict.update(missing_ent_dict)

# Change labels for universities: everything that contains 'uni' (case-insensitive)
uni_dict = {k: 'UNIVERSITY' for k in ner_dict if re.search('uni', k, re.IGNORECASE)}
ner_dict.update(uni_dict)
edge_list = [(x, y, 'UNIVERSITY' if re.search('uni', y, re.IGNORECASE) else z) for x, y, z in edge_list]

# Change labels for people in diag
diag_dict = {k: 'DIAG_PERSON' for k in ner_dict if k in diag_name_list}
ner_dict.update(diag_dict)
edge_list = [(x, y, 'DIAG_PERSON' if y in diag_name_list else z) for x, y, z in edge_list]

# Remove references to the self...
edge_list = [edge for edge in edge_list if edge[0] != edge[1]]
# </editor-fold>

# <editor-fold desc='Creating graph objects'>
# Here we can play around with the data that we have gathered.
# The graphs contain a lot more information.. some of this is displayed in the given html.. some of it is not.

all_label = ['ORGANIZATION', 'PERSON', 'DIAG_PERSON', 'DATE', 'UNIVERSITY']

list_of_studies = []
list_of_joined_dates = []
for i_label in all_label:
    temp_graph = edge2graph(i_label, edge_list, diag_name_list, people_bio_dict)

    # edges(data=True) gives every undirected edge exactly once. Walking the adjacency mapping instead
    # visits each edge from both of its endpoints, which counted every study twice.
    for i_node_a, i_node_b, i_edge_attr in temp_graph.edges(data=True):
        temp_text = i_edge_attr['text']
        if not temp_text:
            continue
        if 'studied' in temp_text:
            temp_study = re.sub('studied (.*) at', '\\1', temp_text)
            list_of_studies.append(temp_study)
        # The checks below depend on the node at the end of the edge, so both endpoints are inspected.
        for i_node in (i_node_a, i_node_b):
            # Only a node containing a digit can be the date that someone joined
            if 'joined' in temp_text and re.search(r'\d', i_node):
                list_of_joined_dates.append('-'.join(sorted(i_node.split())))
            if 'supervised by' in temp_text:
                print(i_node, i_edge_attr)

print("List of studies done by the DIAG people:")
for i in collections.Counter(list_of_studies).most_common(10):
    print('{0} by {1} people'.format(i[0], i[1]))

print("Order of years that people joined:")
for i in sorted(list_of_joined_dates):
    print(i)

# Raw strings keep the backslashes of the pattern without relying on an invalid escape sequence.
joined_years = [re.sub(r'.*(\d{4}).*', r'\1', x) for x in list_of_joined_dates]
print("Most common years to join:")
for i in collections.Counter(joined_years).most_common(3):
    print('Year {0} with {1} people'.format(i[0], i[1]))

# </editor-fold>


